GIS Demonstration.R 2024, February Tim Norris tnorris@miami.edu
This is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or any later version.
This code is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. https://www.gnu.org/licenses/.
# https://leafletjs.com
library(leaflet)
# https://r-spatial.github.io/sf/articles/sf1.html
library(sf)
## Warning: package 'sf' was built under R version 4.0.5
## Linking to GEOS 3.9.1, GDAL 3.4.0, PROJ 8.1.1; sf_use_s2() is TRUE
library(jqr)
library(curl)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:jqr':
##
## vars
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:jqr':
##
## combine, contains, do, do_, funs, select, select_, vars
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(spdep)
## Loading required package: sp
## The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
## which was just loaded, will retire in October 2023.
## Please refer to R-spatial evolution reports for details, especially
## https://r-spatial.org/r/2023/05/15/evolution4.html.
## It may be desirable to make the sf package available;
## package maintainers should consider adding sf to Suggests:.
## The sp package is now running under evolution status 2
## (status 2 uses the sf package in place of rgdal)
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
library(spatialreg)
## Loading required package: Matrix
##
## Attaching package: 'spatialreg'
## The following objects are masked from 'package:spdep':
##
## aple, aple.mc, aple.plot, as_dgRMatrix_listw, as_dsCMatrix_I,
## as_dsCMatrix_IrW, as_dsTMatrix_listw, as.spam.listw, can.be.simmed,
## cheb_setup, create_WX, do_ldet, eigen_pre_setup, eigen_setup,
## eigenw, errorsarlm, get.ClusterOption, get.coresOption,
## get.mcOption, get.VerboseOption, get.ZeroPolicyOption,
## GMargminImage, GMerrorsar, griffith_sone, gstsls, Hausman.test,
## impacts, intImpacts, invIrM, invIrW, Jacobian_W, jacobianSetup,
## l_max, lagmess, lagsarlm, lextrB, lextrS, lextrW, lmSLX, localAple,
## LU_prepermutate_setup, LU_setup, Matrix_J_setup, Matrix_setup,
## mcdet_setup, MCMCsamp, ME, mom_calc, mom_calc_int2, moments_setup,
## powerWeights, sacsarlm, SE_classic_setup, SE_interp_setup,
## SE_whichMin_setup, set.ClusterOption, set.coresOption,
## set.mcOption, set.VerboseOption, set.ZeroPolicyOption,
## similar.listw, spam_setup, spam_update_setup, SpatialFiltering,
## spautolm, spBreg_err, spBreg_lag, spBreg_sac, stsls,
## subgraph_eigenw, trW
library(spgwr)
## NOTE: This package does not constitute approval of GWR
## as a method of spatial analysis; see example(gwr)
from: https://gdsc.idsc.miami.edu explore the metadata at: https://gdsc.idsc.miami.edu/detail/mdc_2021_acs_5yr_tract_dvmt
base_api_url <- "https://gdsc.idsc.miami.edu/functions/"
table_name <- "mdc_2021_acs_5yr_tract_dvmt_geojson"
full_api_url <- paste0(base_api_url,table_name)
print(full_api_url)
## [1] "https://gdsc.idsc.miami.edu/functions/mdc_2021_acs_5yr_tract_dvmt_geojson"
Note that EPSG 4269 is the native coordinate reference system:
NAD83 lat/lng
EPSG 2236: NAD83 State Plane Florida East Feet (for measured
distances).
EPSG 3857: WGS84 Web Mercator (for visualizing in RStudio on the
internet).
tracts_sp <- sf::st_transform(tracts,crs='EPSG:2236')
tracts <- sf::st_transform(tracts,crs='EPSG:4326') # WGS84 lat/lng
rm(tracts_geojson)
rm(tracts_json)
class(tracts)
## [1] "sf" "data.frame"
attr(tracts,"sf_column")
## [1] "geometry"
print(tracts[c(1,2,3,4,15)], n=3)
## Simple feature collection with 707 features and 5 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -80.8736 ymin: 25.13742 xmax: -80.06279 ymax: 25.97943
## Geodetic CRS: WGS 84
## First 3 features:
## ogc_fid statefp countyfp tractce total_population
## 1 2045 12 086 009312 5468
## 2 3613 12 086 009318 1560
## 3 1135 12 086 012402 3442
## geometry
## 1 MULTIPOLYGON (((-80.3139 25...
## 2 MULTIPOLYGON (((-80.32556 2...
## 3 MULTIPOLYGON (((-80.35072 2...
names(tracts)
## [1] "ogc_fid"
## [2] "statefp"
## [3] "countyfp"
## [4] "tractce"
## [5] "geoid"
## [6] "name"
## [7] "namelsad"
## [8] "mtfcc"
## [9] "funcstat"
## [10] "aland"
## [11] "awater"
## [12] "intptlat"
## [13] "intptlon"
## [14] "geom_local"
## [15] "total_population"
## [16] "persons_4_years_old_and_younger"
## [17] "age_of_5_and_19"
## [18] "persons_between_the_age_of_20_and_64"
## [19] "persons_65_years_old_and_over"
## [20] "white_alone"
## [21] "black_or_african_american_alone"
## [22] "american_indian_and_alaska_native_alone"
## [23] "asian_alone"
## [24] "pacific_islander"
## [25] "some_other_race"
## [26] "two_or_more_races"
## [27] "hispanic_or_latino__any_race"
## [28] "white_alone__not_hispanic_or_latino"
## [29] "high_school_degree__ged_or_lower"
## [30] "some_college__no_degree"
## [31] "associates_degree"
## [32] "bachelors_degree"
## [33] "graduate_professional_degree"
## [34] "median_income_of_residents"
## [35] "median_monthly_housing_costs"
## [36] "median_monthly_owner_costs"
## [37] "median_monthly_renter_costs"
## [38] "median_monthly_owner_costs__percent_household_income"
## [39] "median_monthly_renter_costs__percent_household_income"
## [40] "aggregate_household_income"
## [41] "households_below_poverty_level"
## [42] "medicare_recipients"
## [43] "total_medicaid_recipients"
## [44] "total_snap_recipients"
## [45] "armed_forces_veterans"
## [46] "total_number_of_disabled_residents"
## [47] "total_housing_units"
## [48] "total_single_family_houses"
## [49] "total_duplex_multiple_units__less_than_10_units"
## [50] "number_of_housing_structures_with_10_19_units"
## [51] "number_of_housing_structures_with_20_49_units"
## [52] "number_of_housing_structures_with_50_or_more_units"
## [53] "housing_tenancy__owner_occupied_housing_units"
## [54] "housing_tenancy__renter_occupied_housing_units"
## [55] "geometry"
summary(tracts$median_income_of_residents)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -666666666 40310 58393 -23508980 79828 250001
tracts$median_income_of_residents <- na_if(
tracts$median_income_of_residents,-666666666
)
summary(tracts$median_income_of_residents)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 16014 42893 60390 67182 81380 250001 25
summary(tracts$total_population)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 2762 3802 3805 4778 10215
summary(tracts$white_alone)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 1355 2137 2150 2869 5647
summary(tracts$white_alone/tracts$total_population)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.5015 0.6261 0.5781 0.7127 1.0000 9
summary(tracts$households_below_poverty_level)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 256.0 487.0 589.0 802.5 4042.0
summary(tracts$total_housing_units)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 1075 1445 1506 1848 4617
summary(tracts$households_below_poverty_level/tracts$total_housing_units)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.1896 0.3476 0.4187 0.5454 4.1543 10
First set some color bins for the data.
Note first ‘palette’ parameter comes from https://colorbrewer2.org.
You can experiment with different palettes.
median_income_pal <- colorBin(
palette = "YlGnBu", # color ramp
tracts$median_income_of_residents, # data column
7 # number of bins
)
Set output dimension
output.height <- '800px'
Create map object
m_median_income <- leaflet(height=800) %>%
addTiles() %>% # Add default OpenStreetMap map tiles
addPolygons(
data = tracts,
fillColor = ~median_income_pal(median_income_of_residents),
fillOpacity = 0.8,
weight = 0.3,
label = ~paste('median income in US$: ',median_income_of_residents),
labelOptions = labelOptions(style = list(
"font-weight" = "normal",
padding = "3px 8px"),
textsize = "13px", direction = "auto"),
highlight = highlightOptions(color = "black", weight = 4, bringToFront = TRUE))
# draw the map
m_median_income
Make a normalized column (percentages better for maps).
tracts$percent_white <- tracts$white_alone/tracts$total_population
First set some color bins for the data.
percent_white_pal <- colorBin(
"OrRd",
tracts$percent_white,
7)
Set output dimension
output.height <- '800px'
Create map object.
m_percent_white <- leaflet(height=800) %>%
addTiles() %>% # Add default OpenStreetMap map tiles
addPolygons(
data = tracts,
fillColor = ~percent_white_pal(tracts$percent_white),
fillOpacity = 0.8,
weight = 0.3,
label = ~paste('percent white: ',tracts$percent_white),
labelOptions = labelOptions(style = list(
"font-weight" = "normal",
padding = "3px 8px"),
textsize = "13px", direction = "auto"),
popup = ~paste('<strong>white_alone:',white_alone,'</strong><br>Total Pop:',total_population),
highlight = highlightOptions(color = "black", weight = 4, bringToFront = TRUE))
# draw the map
m_percent_white
We will model median income as a function of percent white.
What do you expect this model to look like? Which way will the
correlation be sloped?
Make a linear regression model and look at the summary.
nonspatial <- lm(percent_white~median_income_of_residents,data=tracts)
summary(nonspatial)
##
## Call:
## lm(formula = percent_white ~ median_income_of_residents, data = tracts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.60349 -0.09531 0.03760 0.12480 0.45003
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.246e-01 1.554e-02 27.32 <2e-16 ***
## median_income_of_residents 2.273e-06 2.061e-07 11.03 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1846 on 680 degrees of freedom
## (25 observations deleted due to missingness)
## Multiple R-squared: 0.1518, Adjusted R-squared: 0.1505
## F-statistic: 121.7 on 1 and 680 DF, p-value: < 2.2e-16
It looks like we have a strong relationship!! The p-values for the intercept and percent_with predictor are very low.
Make a plot of the basic linear model.
options(repr.plot.width = 8, repr.plot.height = 8)
plot(tracts$percent_white,tracts$median_income_of_residents)
abline(nonspatial, col="red")
Make a basic q-q plot.
qqnorm(nonspatial$residuals)
Calculate the residuals
x = data.frame(
actual = nonspatial$fitted.values + nonspatial$residuals,
fitted = nonspatial$fitted.values)
x = x[order(x$fitted),]
Make a quick plot of the residuals
plot(x$actual, col='#00000040', ylab="percent white", las=1)
lines(x$fitted, col='#c00000', lwd=3)
The model suggests a relationship between percent white and median
income exists,
BUT The R-squared suggests only 15% of explanation, seems that there is
a lot
of other stuff going on as well. This lack of explanation is also shown
in the
residual plot directly above (not tightly clustered). The Q-Q plot
suggests a
distribution that is skewed to the right.
Median income
plot(tracts_sp[34])
tracts_sp$median_income_of_residents <- na_if(
tracts_sp$median_income_of_residents,-666666666
)
plot(tracts_sp[34])
Normalized percent white
tracts_sp$percent_white <- tracts_sp$white_alone/tracts_sp$total_population
plot(tracts_sp[56])
There is a basic problem in all geo-spatial data analysis related
to
Tobler’s First Law: everything is related to everything else, but
near
things are more related than distant things. This is reflected in
the
skewed Q-Q plot … our independent variable is not independent,
but
but instead varies according to spatial proximity to other data
points.
This is known as spatial autocorrelation. The Moran’s I statistic
can
confirm if we have spatial autocorrelation in our data.
https://rspatial.org/raster/analysis/3-spauto.html
\(I = \frac{n}{\sum_{i=1}^n (y_i - \bar{y})^2} \frac{\sum_{i=1}^n \sum_{j=1}^n w_{ij}(y_i - \bar{y})(y_j - \bar{y})}{\sum_{i=1}^n \sum_{j=1}^n w_{ij}}\)
First remove all NA values
tracts_sp <- na.omit(tracts_sp)
Create a list of neighbors for each polygon.
w <- poly2nb(tracts_sp, row.names=tracts_sp$fips)
class(w)
## [1] "nb"
summary(w)
## Neighbour list object:
## Number of regions: 682
## Number of nonzero links: 4254
## Percentage nonzero weights: 0.9145948
## Average number of links: 6.237537
## Link number distribution:
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 1 7 31 71 126 169 133 67 44 17 12 1 1 2
## 1 least connected region:
## 499 with 1 link
## 2 most connected regions:
## 12 184 with 14 links
xy <- st_centroid(tracts_sp$geometry)
class(xy)
## [1] "sfc_POINT" "sfc"
Transform neighbor object to a sf object. Note that we have to set the CRS otherwise the coordinates are just numbers and not actually located on the Earth’s surface.
w_sf <- as(nb2lines(w, coords = xy), 'sf')
w_sf <- st_set_crs(w_sf, st_crs(tracts_sp))
Make plot
options(repr.plot.width = 16, repr.plot.height = 16)
ggplot() +
geom_sf(data=tracts_sp,aes(fill = percent_white)) +
scale_fill_binned(type = "viridis") +
geom_sf(data=xy,color="white",size=0.5) +
geom_sf(data=w_sf,color="white",alpha=0.25) +
coord_sf(datum=NA)
Transform neighbors into spatial weights matrix
wm <- nb2mat(w, style='B')
class(wm)
## [1] "matrix" "array"
Get the matrix as a list
ww <- nb2listw(w, style='B')
class(ww)
## [1] "listw" "nb"
Calculate Moran’s I (from spdep library)
moran(tracts_sp$percent_white, ww, n=length(ww$neighbours), S0=Szero(ww))
## $I
## [1] 0.6794024
##
## $K
## [1] 3.337243
Moran’s I varies from -1 to 1. The further away from 0, the more
spatial
autocorrelation - with a value of 0.68, we clearly show spatial
autocorrelation.
moran.test(nonspatial$residuals, ww, zero.policy=T)
##
## Moran I test under randomisation
##
## data: nonspatial$residuals
## weights: ww
##
## Moran I statistic standard deviate = 29.588, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.6364672256 -0.0014684288 0.0004648754
moran.plot(nonspatial$residuals, ww, zero.policy=T)
The Moran’s I statistic of 0.45 for the residuals, and the Moran plot
of the
residuals showing a slight upward trend from left to right, confirms
this
observation of spatial autocorrelation.
Instead of the approach above you should use Monte Carlo simulation.
That
is the preferred method (in fact, the only good method). The way it
works
that the values are randomly assigned to the polygons, and the Moran’s
I
is computed. This is repeated several times to establish a distribution
of
expected values. The observed value of Moran’s I is then compared with
the
simulated distribution to see how likely it is that the observed
values
could be considered a random draw.
For percent_white
moran.mc(tracts_sp$percent_white, ww, nsim=99)
##
## Monte-Carlo simulation of Moran I
##
## data: tracts_sp$percent_white
## weights: ww
## number of simulations + 1: 100
##
## statistic = 0.6794, observed rank = 100, p-value = 0.01
## alternative hypothesis: greater
For median_income
moran.mc(tracts_sp$median_income, ww, nsim=99)
##
## Monte-Carlo simulation of Moran I
##
## data: tracts_sp$median_income
## weights: ww
## number of simulations + 1: 100
##
## statistic = 0.49954, observed rank = 100, p-value = 0.01
## alternative hypothesis: greater
From the global Moran’s I we can conclude that both the percent_whit
and
median_income are clustered globally. The assumption that the
predictor
percent_white is independent is violated. Both variables show
spatial
auto-correlation.
Get a labeled set of tracts
tracts_sp_named <- data.frame(tracts_sp,row.names=tracts_sp$fips)
lm_pw <- localmoran(tracts_sp$percent_white,ww)
lm_pw_named <- data.frame(lm_pw,row.names=tracts_sp$fips)
lm_pw_sp <- merge(tracts_sp_named, lm_pw_named, by=0)
percent_white <- summary(lm_pw_sp$percent_white)
zscores <- summary(lm_pw_sp$Z.Ii) # Z-scores for Local Moran's I
lm_pw_sp$sig <- lm_pw_sp$Pr.z....E.Ii.. < 0.05
lm_pw_sp$highv <- lm_pw_sp$percent_white > percent_white['Mean']
lm_pw_sp$lowv <- lm_pw_sp$percent_white < percent_white['Mean']
lm_pw_sp$highz <- lm_pw_sp$Z.Ii > zscores['Mean']
lm_pw_sp$lowz <- lm_pw_sp$Z.Ii < zscores['Mean']
lm_pw_sp$q <- 'ns'
Create a categorical column for easy mapping.
lm_pw_sp <- within(lm_pw_sp,{
q[sig & highv & highz]='HH'
q[sig & lowv & lowz]='LL'
q[sig & lowv & highz]='LH'
q[sig & highv & lowz]='HL'
})
Show percent_white clusters according to Local Moran’s I.
options(repr.plot.width = 16, repr.plot.height = 16)
ggplot() +
geom_sf(data=lm_pw_sp$geometry,aes(fill = lm_pw_sp$q)) +
scale_fill_manual(values=c('HH'="#e31a1c",'LL'="#CCCCFF", 'LH'="#1f78b4", 'HL'="#FFCCCC", 'ns'="#FFFFFF"))
options(repr.plot.width = 8, repr.plot.height = 8)
HH is where tracts that have percent white are surrounded by tracts with high percent white - a cluster. Same for LH. These are tracts where low percent white tracts are surrounded by low percent white tracts. Both the HL and LL are tracts that have either high or low percent white and are surrounded by the opposite - the anomalies.
lm_mi <- localmoran(tracts_sp$median_income,ww)
lm_mi_named <- data.frame(lm_mi,row.names=tracts_sp$fips)
lm_mi_sp <- merge(tracts_sp_named, lm_mi_named, by=0)
median_income <- summary(lm_mi_sp$median_income)
zscores <- summary(lm_mi_sp$Z.Ii) # Z-scores for Local Moran's I
Make the bins
lm_mi_sp$sig <- lm_mi_sp$Pr.z....E.Ii.. < 0.05
lm_mi_sp$highv <- lm_mi_sp$median_income > median_income['Mean']
lm_mi_sp$lowv <- lm_mi_sp$median_income < median_income['Mean']
lm_mi_sp$highz <- lm_mi_sp$Z.Ii > zscores['Mean']
lm_mi_sp$lowz <- lm_mi_sp$Z.Ii < zscores['Mean']
lm_mi_sp$q <- 'ns'
Create a categorical column for easy mapping
lm_mi_sp <- within(lm_mi_sp,{
q[sig & highv & highz]='HH'
q[sig & lowv & highz]='LH'
q[sig & lowv & lowz]='LL'
q[sig & highv & lowz]='HL'
})
Show percent_white clusters according to Local Moran’s I
options(repr.plot.width = 16, repr.plot.height = 16)
ggplot() +
geom_sf(data=lm_mi_sp$geometry,aes(fill = lm_mi_sp$q)) +
scale_fill_manual(values=c('HH'="#e31a1c",'LL'="#CCCCFF", 'LH'="#1f78b4", 'HL'="#FFCCCC", 'ns'="#FFFFFF"))
options(repr.plot.width = 8, repr.plot.height = 8)
HH is where tracts that have high income are surrounded by tracts with high income - a cluster. Same for LH. These are tracts where low income tracts are surrounded by low income tracts. Both the HL and LL are tracts that have either high or low median income surrounded by the opposite with no clustering - the anomalies.
Take a moment to look back and for between the maps. Do the clusters appear in the same location? Would your observations change the way you interpret the basic linear model?
Spatial lag regression provides an alternative model that takes into account the auto-correlation.
\(y = \rho W y + X \beta + \varepsilon\)
where \(\rho\) is found by
optimize() first, and \(\beta\) and
other
parameters by generalized least squares subsequently
(one-dimensional
search using optim performs badly on some platforms). In the
spatial
Durbin (mixed) model, the spatially lagged independent variables are
added to X.
lagged <- lagsarlm(
percent_white~median_income_of_residents,
data=tracts_sp,
listw=ww,
tol.solve=1.0e-30,
zero.policy=T)
summary(lagged)
##
## Call:lagsarlm(formula = percent_white ~ median_income_of_residents,
## data = tracts_sp, listw = ww, zero.policy = T, tol.solve = 1e-30)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5507676 -0.1014547 0.0081897 0.1123226 0.4481000
##
## Type: lag
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.5622e-01 2.0501e-02 12.498 < 2.2e-16
## median_income_of_residents 1.8254e-06 1.8228e-07 10.014 < 2.2e-16
##
## Rho: 0.055588, LR test value: 164.76, p-value: < 2.22e-16
## Asymptotic standard error: 0.0045204
## z-value: 12.297, p-value: < 2.22e-16
## Wald statistic: 151.22, p-value: < 2.22e-16
##
## Log likelihood: 268.1141 for lag model
## ML residual variance (sigma squared): 0.0261, (sigma: 0.16155)
## Number of observations: 682
## Number of parameters estimated: 4
## AIC: -528.23, (AIC for lm: -365.47)
## LM test for residual autocorrelation
## test value: 262.04, p-value: < 2.22e-16
The lower AIC value for the spatial lag model relative to the
non-spatial model indicates that the spatial lag model has a
slightly
better fit than the non-spatial model.
Make a residuals plot
x = data.frame(actual = lagged$fitted.values + lagged$residuals, fitted = lagged$fitted.values)
x = x[order(x$fitted),]
plot(x$actual, col='#00000040')
lines(x$fitted, col='#c00000', lwd=3)
The lagged model and the linear model are actually still pretty close, BUT how does the cluster analysis influence how you interpret the model? It seems that there are other important factors to predict median income other than whiteness that we are not considering …. more work to do.
NOTE: This package does not constitute approval of
GWR
as a method of spatial analysis; see example(gwr)
In other words, everything below this point is experimental and not
really
working right or to be trusted if it actually runs ….
Spatially Weighted Regression is not a true regression
(i.e. predictive
model), but instead tries to predict where
non-stationarity is taking place on the map, that is where locally
weighted regression coefficients move away from their global values.
see this vignette:
https://cran.r-project.org/web/packages/spgwr/vignettes/GWR.html#Geographically_Weighted_Regression_1